full_data <- readRDS('data/full_data.rds')

Compute Daily Max H2S

H2S_dm <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_max = max(H2S, na.rm = TRUE)) %>%
  mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 123 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1724: `day = 2020-08-22`, `Monitor = "Judson"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 122 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_ma <- H2S_dm %>%
  mutate(yearmonth = format(day, "%Y-%m")) %>%
  group_by(yearmonth, Monitor) %>%
  summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE))
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
H2S_dm_graph <- H2S_dm %>%
 ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Max H2S concentration by monitor", y = 'Daily Max H2S', x = 'time') +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph) %>% layout(dragmode = 'pan')
H2S_dm_graph_b <- H2S_dm %>%
 ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Max H2S concentration by monitor w/o outlier", y = 'Daily Max H2S', x = 'time') +
   scale_y_continuous(limits = c(0, 100)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
H2S_ma_graph <- H2S_ma %>%
 ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Monthly Average H2S concentration by monitor", y = 'Monthly Average H2S', x = 'time') +
   scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_ma_graph) %>% layout(dragmode = 'pan')
H2S_ma_graph_b <- H2S_ma %>%
 ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Monthly Average H2S concentration by monitor w/o outlier", y = 'Monthly Average H2S', x = 'time') +
   scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
  scale_y_continuous(limits=c(0, 50)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_ma_graph_b) %>% layout(dragmode = 'pan')

The line with the largest deviation/peak corresponds to the 213th&Chico monitor, in 2021-10 Maybe we should start from January 2022 to start modelling to remove the extreme values, or simply removing the 213 monitor since that’s set up specifically for the event.

Daily average

# Compute daily average
H2S_da <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_avg = mean(H2S, na.rm=TRUE))
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_da_graph <- H2S_da %>%
 ggplot(aes(x=day, y=H2S_daily_avg, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Average H2S concentration by monitor", y = 'Daily Average H2S', x = 'time') +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_da_graph) %>% layout(dragmode = 'pan')
H2S_da_graph_b <- H2S_da %>%
 ggplot(aes(x=day, y=H2S_daily_avg, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Average H2S concentration by monitor w/o outlier", y = 'Daily Avearge H2S', x = 'time') +
   scale_y_continuous(limits = c(0, 100)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
full_data %>%
  polarPlot(pollutant = "H2S", col = "turbo", 
            key.position = "bottom",
            key.header = "mean H2S", 
            key.footer = NULL, title = 'hi')

# Create a polar map
# TBD: include markers for refineries
polarMap(
  full_data %>% 
    filter(!(is.na(latitude) | is.na(H2S) | is.na(wd) | is.na(ws))) %>%
    rename(date = DateTime),
  pollutant = "H2S",
  latitude = "latitude",
  longitude = "longitude",
  popup = "Monitor",
  provider = "Esri.WorldImagery",
  d.icon = 150,
  d.fig = 2.5,
  alpha = 0.75
)
## 
Creating Polar Markers â– â– â– â– â– â–  15% | ETA: 12s

Creating Polar Markers â– â– â– â– â– â– â– â–  23%
## | ETA: 10s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â–  31% | ETA: 8s

Creating Polar Markers
## â– â– â– â– â– â– â– â– â– â– â– â– â–  38% | ETA: 7s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  46% | ETA:
## 6s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  54% | ETA: 5s

Creating Polar Markers
## â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  62% | ETA: 4s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 
## 69% | ETA: 3s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  77% | ETA:
## 3s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  85% | ETA: 2s

Creating Polar
## Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  92% | ETA: 1s

# Compute daily average wd/ws
daily_full <- full_data %>%
              select(c(H2S, ws, wd, latitude, longitude, Monitor, MinDist, 
                       Converted_Angle, Refinery, year, month, day, weekday)) %>%
  group_by(Monitor, day, latitude, longitude, Refinery, Converted_Angle, year, 
           month, MinDist, weekday) %>%
  summarise(H2S_daily_max = max(H2S, na.rm=TRUE),
            H2S_daily_avg = mean(H2S, na.rm=TRUE),
            ws_avg = mean(ws, na.rm=TRUE),
            wd_avg = mean(wd, na.rm=TRUE)) %>%
  mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 124 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1637: `Monitor = "First Methodist"`, `day = 2022-11-23`, `latitude =
##   33.78223`, `longitude = -118.2818`, `Refinery = "Phillips 66 (Wilmington)"`,
##   `Converted_Angle = 209`, `year = "2022"`, `month = "11"`, `MinDist =
##   2119.191`, `weekday = Wed`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 123 remaining warnings.
## `summarise()` has grouped output by 'Monitor', 'day', 'latitude', 'longitude',
## 'Refinery', 'Converted_Angle', 'year', 'month', 'MinDist'. You can override
## using the `.groups` argument.

Explore some GAM models

Five minute H2S

h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        1.219e+00  7.067e-02  17.246  < 2e-16 ***
## year2021                           2.969e-01  3.968e-02   7.483 7.26e-14 ***
## year2022                          -7.641e-01  4.439e-02 -17.214  < 2e-16 ***
## year2023                          -6.640e-01  5.647e-02 -11.759  < 2e-16 ***
## wd_secQ2                          -3.976e-02  4.194e-02  -0.948   0.3432    
## wd_secQ3                           2.062e-01  4.260e-02   4.842 1.29e-06 ***
## wd_secQ4                           8.677e-02  4.028e-02   2.154   0.0312 *  
## ws                                -1.184e-01  4.547e-03 -26.031  < 2e-16 ***
## I(1/MinDist^2)                     1.865e+05  3.862e+04   4.828 1.38e-06 ***
## RefineryMarathon (Carson)          2.788e+00  5.863e-02  47.557  < 2e-16 ***
## RefineryMarathon (Wilmington)      7.871e-01  6.773e-02  11.622  < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -1.573e-01  6.726e-02  -2.339   0.0194 *  
## RefineryPhillips 66 (Wilmington)   3.108e-01  6.561e-02   4.738 2.16e-06 ***
## RefineryTorrance Refinery          3.250e-02  5.499e-02   0.591   0.5545    
## RefineryValero Refinery            6.458e-01  6.617e-02   9.760  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.997      8 1198  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.00935   Deviance explained = 0.936%
## GCV = 291.75  Scale est. = 291.75    n = 1988825
plot(h2s_model_a)

h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
                   Refinery+s(latitude, longitude, bs='tp', k = 5), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) + 
##     Refinery + s(latitude, longitude, bs = "tp", k = 5)
## 
## Parametric coefficients:
##                                     Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                       -1.348e-01  3.425e-02   -3.936 8.28e-05 ***
## wd_secQ2                          -1.853e-01  3.678e-03  -50.388  < 2e-16 ***
## wd_secQ3                          -1.666e-01  3.890e-03  -42.824  < 2e-16 ***
## wd_secQ4                          -9.164e-02  3.475e-03  -26.372  < 2e-16 ***
## ws                                -6.987e-02  4.145e-04 -168.538  < 2e-16 ***
## I(1/MinDist^2)                     3.018e+05  3.351e+03   90.066  < 2e-16 ***
## RefineryMarathon (Carson)          8.595e-01  3.705e-02   23.201  < 2e-16 ***
## RefineryMarathon (Wilmington)      1.937e+00  3.896e-02   49.718  < 2e-16 ***
## RefineryPhillips 66 (Wilimington)  2.743e+00  4.291e-02   63.928  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)   3.613e+00  4.473e-02   80.779  < 2e-16 ***
## RefineryTorrance Refinery         -3.377e+00  6.407e-02  -52.710  < 2e-16 ***
## RefineryValero Refinery            2.508e+00  4.323e-02   58.006  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df    F p-value    
## s(as.numeric(month))  7.994      8 1858  <2e-16 ***
## s(latitude,longitude) 4.000      4 3137  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.149   Deviance explained = 14.9%
## GCV = 0.87876  Scale est. = 0.87874   n = 764964
plot(h2s_model_b)

# Include converted angle and weekday
h2s_model_c <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + wd_sec + ws + 
                     I(1/MinDist^2) + Converted_Angle + Refinery + 
                     s(latitude, longitude, bs='tp', k = 10) + 
                     as.factor(weekday), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2) + Converted_Angle + Refinery + s(latitude, 
##     longitude, bs = "tp", k = 10) + as.factor(weekday)
## 
## Parametric coefficients:
##                                     Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                        1.3630775 11.2200959    0.121   0.9033    
## year2023                          -0.3241836  0.0028931 -112.056  < 2e-16 ***
## wd_secQ2                          -0.1824656  0.0036362  -50.180  < 2e-16 ***
## wd_secQ3                          -0.1624306  0.0038477  -42.215  < 2e-16 ***
## wd_secQ4                          -0.1014221  0.0034359  -29.518  < 2e-16 ***
## ws                                -0.0675438  0.0004108 -164.435  < 2e-16 ***
## I(1/MinDist^2)                     0.0001051  0.0006206    0.169   0.8655    
## Converted_Angle                    0.0010137  0.0111029    0.091   0.9273    
## RefineryMarathon (Carson)          0.4583756  9.8623512    0.046   0.9629    
## RefineryMarathon (Wilmington)      0.1483215 10.9929681    0.013   0.9892    
## RefineryPhillips 66 (Wilimington) -1.4289746 11.5737536   -0.123   0.9017    
## RefineryPhillips 66 (Wilmington)  -1.1623978 12.3479442   -0.094   0.9250    
## RefineryTorrance Refinery          0.3999050  6.8572067    0.058   0.9535    
## RefineryValero Refinery           -1.0866519 12.1482149   -0.089   0.9287    
## as.factor(weekday).L               0.0993529  0.0028026   35.451  < 2e-16 ***
## as.factor(weekday).Q              -0.1711193  0.0028096  -60.905  < 2e-16 ***
## as.factor(weekday).C              -0.0048304  0.0028039   -1.723   0.0849 .  
## as.factor(weekday)^4              -0.0207098  0.0028135   -7.361 1.83e-13 ***
## as.factor(weekday)^5              -0.0591083  0.0028069  -21.058  < 2e-16 ***
## as.factor(weekday)^6              -0.0269018  0.0028188   -9.544  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df       F p-value    
## s(as.numeric(month))  7.997      8 2662.73  <2e-16 ***
## s(latitude,longitude) 4.000      4    0.22   0.927    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 36/37
## R-sq.(adj) =  0.168   Deviance explained = 16.8%
## GCV = 0.85828  Scale est. = 0.85824   n = 764964
plot(h2s_model_c)

Daily max H2S

# With month, year, wind sector/speed, dist to refinery, refinery
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_avg+ws_avg+
                        I(1/MinDist^2)+Refinery, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_avg + 
##     ws_avg + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        6.056e-01  5.391e-01   1.123 0.261378    
## year2023                          -5.223e-01  2.149e-01  -2.430 0.015164 *  
## wd_avg                             6.598e-03  2.205e-03   2.993 0.002788 ** 
## ws_avg                            -2.853e-01  6.503e-02  -4.388 1.19e-05 ***
## I(1/MinDist^2)                     3.235e-06  4.297e-07   7.529 7.01e-14 ***
## RefineryMarathon (Carson)          1.236e+00  3.266e-01   3.785 0.000157 ***
## RefineryMarathon (Wilmington)      2.975e+00  3.771e-01   7.890 4.40e-15 ***
## RefineryPhillips 66 (Wilimington)  4.567e-01  3.786e-01   1.206 0.227788    
## RefineryPhillips 66 (Wilmington)   2.341e+00  3.243e-01   7.220 6.76e-13 ***
## RefineryTorrance Refinery          1.204e+00  3.454e-01   3.487 0.000497 ***
## RefineryValero Refinery            6.020e+00  3.809e-01  15.803  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F  p-value    
## s(as.numeric(month)) 2.273      8 2.702 3.76e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 18/19
## R-sq.(adj) =  0.121   Deviance explained = 12.5%
## GCV = 19.227  Scale est. = 19.138    n = 2637
plot(h2s_dm_model_a)

# Also with location of monitor
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Refinery+s(latitude, longitude, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Refinery + s(latitude, longitude, bs = "tp", 
##     k = 10)
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        13.278600  19.964624   0.665  0.50604    
## wd_avg                              0.006495   0.002230   2.912  0.00362 ** 
## ws_avg                             -0.344070   0.064991  -5.294  1.3e-07 ***
## I(1/MinDist^2)                      0.001695   0.002483   0.683  0.49481    
## RefineryMarathon (Carson)          -6.251628  24.588190  -0.254  0.79932    
## RefineryMarathon (Wilmington)     -10.237090  26.040769  -0.393  0.69426    
## RefineryPhillips 66 (Wilimington) -19.210513  23.018650  -0.835  0.40404    
## RefineryPhillips 66 (Wilmington)  -19.586228  23.546540  -0.832  0.40559    
## RefineryTorrance Refinery          -3.383526  17.084565  -0.198  0.84302    
## RefineryValero Refinery           -14.838330  27.412552  -0.541  0.58835    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F  p-value    
## s(as.numeric(month))  2.109  8.000 2.015 5.71e-05 ***
## s(latitude,longitude) 4.916  4.995 3.226  0.00842 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 26/27
## R-sq.(adj) =  0.126   Deviance explained = 13.1%
## GCV = 19.142  Scale est. = 19.026    n = 2637
plot(h2s_dm_model_b)

# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Refinery+Converted_Angle+
                        s(latitude, longitude, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Refinery + Converted_Angle + s(latitude, 
##     longitude, bs = "tp", k = 10)
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        5.958e+00  4.228e+01   0.141  0.88795    
## wd_avg                             6.492e-03  2.230e-03   2.911  0.00363 ** 
## ws_avg                            -3.441e-01  6.499e-02  -5.295 1.29e-07 ***
## I(1/MinDist^2)                     6.486e-04  2.227e-03   0.291  0.77085    
## RefineryMarathon (Carson)         -7.569e-01  3.717e+01  -0.020  0.98376    
## RefineryMarathon (Wilmington)     -3.881e+00  4.146e+01  -0.094  0.92541    
## RefineryPhillips 66 (Wilimington) -1.190e+01  4.370e+01  -0.272  0.78542    
## RefineryPhillips 66 (Wilmington)  -1.167e+01  4.664e+01  -0.250  0.80248    
## RefineryTorrance Refinery          4.353e-01  2.583e+01   0.017  0.98656    
## RefineryValero Refinery           -7.588e+00  4.588e+01  -0.165  0.86864    
## Converted_Angle                    8.196e-03  4.190e-02   0.196  0.84493    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F  p-value    
## s(as.numeric(month))  2.109  8.000 2.015 5.71e-05 ***
## s(latitude,longitude) 3.919  3.995 0.552    0.683    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 27/28
## R-sq.(adj) =  0.126   Deviance explained = 13.1%
## GCV = 19.142  Scale est. = 19.026    n = 2637
plot(h2s_dm_model_c)

### Daily Average

# Look at daily average
h2s_da_model_a <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Refinery+Converted_Angle+
                        s(latitude, longitude, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Refinery + Converted_Angle + s(latitude, 
##     longitude, bs = "tp", k = 10)
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        1.1862179  8.6715071   0.137    0.891    
## wd_avg                             0.0016717  0.0002515   6.647 3.63e-11 ***
## ws_avg                            -0.1251589  0.0075858 -16.499  < 2e-16 ***
## I(1/MinDist^2)                     0.0001115  0.0004719   0.236    0.813    
## RefineryMarathon (Carson)          0.3357134  7.6229727   0.044    0.965    
## RefineryMarathon (Wilmington)      0.0362523  8.4981330   0.004    0.997    
## RefineryPhillips 66 (Wilimington) -1.6291837  8.9508408  -0.182    0.856    
## RefineryPhillips 66 (Wilmington)  -1.3536383  9.5501570  -0.142    0.887    
## RefineryTorrance Refinery          0.4037375  5.2994671   0.076    0.939    
## RefineryValero Refinery           -1.1761174  9.3954958  -0.125    0.900    
## Converted_Angle                    0.0005512  0.0085852   0.064    0.949    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df      F p-value    
## s(as.numeric(month))  7.678      8 17.861  <2e-16 ***
## s(latitude,longitude) 3.975      4  0.354    0.84    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 27/28
## R-sq.(adj) =  0.344   Deviance explained = 34.9%
## GCV = 0.24016  Scale est. = 0.23819   n = 2637
plot(h2s_da_model_a)

# Include year too, since we have 2023 now
h2s_da_model_b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+year+
                        I(1/MinDist^2)+Refinery+Converted_Angle+
                        s(latitude, longitude, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     year + I(1/MinDist^2) + Refinery + Converted_Angle + s(latitude, 
##     longitude, bs = "tp", k = 10)
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        1.001e+00  8.495e+00   0.118    0.906    
## wd_avg                             1.580e-03  2.432e-04   6.498 9.73e-11 ***
## ws_avg                            -1.134e-01  7.382e-03 -15.357  < 2e-16 ***
## year2023                          -3.441e-01  2.524e-02 -13.637  < 2e-16 ***
## I(1/MinDist^2)                     9.172e-05  4.624e-04   0.198    0.843    
## RefineryMarathon (Carson)          4.142e-01  7.468e+00   0.055    0.956    
## RefineryMarathon (Wilmington)      2.009e-01  8.325e+00   0.024    0.981    
## RefineryPhillips 66 (Wilimington) -1.298e+00  8.768e+00  -0.148    0.882    
## RefineryPhillips 66 (Wilmington)  -1.000e+00  9.355e+00  -0.107    0.915    
## RefineryTorrance Refinery          4.296e-01  5.192e+00   0.083    0.934    
## RefineryValero Refinery           -9.355e-01  9.204e+00  -0.102    0.919    
## Converted_Angle                    9.287e-04  8.410e-03   0.110    0.912    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df      F p-value    
## s(as.numeric(month))  7.563      8 27.054  <2e-16 ***
## s(latitude,longitude) 3.975      4  0.309   0.871    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 28/29
## R-sq.(adj) =  0.387   Deviance explained = 39.2%
## GCV = 0.22443  Scale est. = 0.22251   n = 2637
plot(h2s_da_model_b)

Monthly

h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.235e+00  6.355e-01   3.517 0.000436 ***
## year2021                           7.458e+00  3.593e-01  20.756  < 2e-16 ***
## year2022                          -9.774e+00  3.997e-01 -24.452  < 2e-16 ***
## year2023                          -3.496e+00  5.072e-01  -6.893 5.47e-12 ***
## wd_secQ2                          -1.640e-01  3.761e-01  -0.436 0.662756    
## wd_secQ3                           3.453e+00  3.826e-01   9.025  < 2e-16 ***
## wd_secQ4                          -1.879e+00  3.616e-01  -5.195 2.05e-07 ***
## ws                                 1.865e-01  4.068e-02   4.584 4.57e-06 ***
## I(1/MinDist^2)                     7.430e+04  3.464e+05   0.214 0.830190    
## RefineryMarathon (Carson)          4.270e+01  5.219e-01  81.819  < 2e-16 ***
## RefineryMarathon (Wilmington)      4.365e+00  6.066e-01   7.197 6.16e-13 ***
## RefineryPhillips 66 (Wilimington)  2.101e+00  6.012e-01   3.495 0.000475 ***
## RefineryPhillips 66 (Wilmington)   4.202e+00  5.879e-01   7.147 8.86e-13 ***
## RefineryTorrance Refinery         -3.120e+00  4.920e-01  -6.341 2.28e-10 ***
## RefineryValero Refinery            6.121e+00  5.915e-01  10.349  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.999      8 4060  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0273   Deviance explained = 2.73%
## GCV =  24107  Scale est. = 24107     n = 2048887
plot(h2s_ma_model_a)

h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude, longitude, bs='tp', k=10), data = full_data)
summary(h2s_ma_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery + s(latitude, longitude, 
##     bs = "tp", k = 10)
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -246.03611 2944.04261  -0.084 0.933398    
## year2021                             4.47329    0.34876  12.826  < 2e-16 ***
## year2022                            -5.14817    0.39702 -12.967  < 2e-16 ***
## year2023                             7.43295    0.50200  14.807  < 2e-16 ***
## wd_secQ2                             0.76576    0.34714   2.206 0.027391 *  
## wd_secQ3                             0.03907    0.35363   0.110 0.912022    
## wd_secQ4                            -1.20988    0.33417  -3.621 0.000294 ***
## ws                                  -0.01555    0.03757  -0.414 0.678875    
## I(1/MinDist^2)                       1.00621    0.17410   5.779  7.5e-09 ***
## RefineryMarathon (Carson)          118.80243 3863.13511   0.031 0.975467    
## RefineryMarathon (Wilmington)     1093.88472 4061.43332   0.269 0.787672    
## RefineryPhillips 66 (Wilimington)  676.96024 3532.53821   0.192 0.848028    
## RefineryPhillips 66 (Wilmington)   714.49113 3595.84938   0.199 0.842498    
## RefineryTorrance Refinery         -125.60869 2722.66743  -0.046 0.963203    
## RefineryValero Refinery            693.54463 4213.77464   0.165 0.869267    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df     F p-value    
## s(as.numeric(month))    8      8  4837  <2e-16 ***
## s(latitude,longitude)   6      6 57858  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 31/32
## R-sq.(adj) =  0.172   Deviance explained = 17.2%
## GCV =  20524  Scale est. = 20524     n = 2048586
plot(h2s_ma_model_b)

Try XGBoost on Daily Max

daily_full <- daily_full %>%
  mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
         Monitor = str_replace_all(Refinery, ' ', '_'),
         weekday = as.character(weekday))

## Try for a continuous month
# tune_grid <- expand.grid(nrounds = c(100, 200),
#                          max_depth = c(2, 3, 4),
#                          eta = c(0.1, 0.3),
#                          gamma = c(0.01, 0.001),
#                          colsample_bytree = c(0.5, 1),
#                          min_child_weight = 0,
#                          subsample = c(0.5, 0.75, 1))
# # Run algorithms using 10-fold cross validation
# control <- trainControl(method="cv", number=10,verboseIter=TRUE, search='grid')

fit.xgb_da <- readRDS('rfiles/fit.xgb_da.rds')
# fit.xgb_da <- train(H2S_daily_avg~., 
#                  method = 'xgbTree', 
#                  data = fastDummies::dummy_cols(daily_full[complete.cases(daily_full),] %>% 
#                    ungroup() %>% 
#                    filter(day >= '2022-02-01') %>% 
#                    select(-c(Refinery, latitude, longitude, day, H2S_daily_max)) %>%
#                    mutate(MinDist = 1/(MinDist^2)), 
#                    remove_selected_columns = TRUE)
#                  ,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da, 'rfiles/fit.xgb_da.rds')
getTrainPerf(fit.xgb_da)
fit.xgb_da$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 321.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 32 
## niter: 200
## nfeatures : 32 
## xNames : Converted_Angle MinDist ws_avg wd_avg Monitor_Chevron_El_Segundo Monitor_Marathon_Carson Monitor_Marathon_Wilmington Monitor_Phillips_66_Wilimington Monitor_Phillips_66_Wilmington Monitor_Torrance_Refinery Monitor_Valero_Refinery year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 70     200         4 0.1  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da,scale=FALSE)
plot(imp, top=10)

SHAP for XGBoost above (Caret)

matrix <- fastDummies::dummy_cols(daily_full[complete.cases(daily_full),] %>% 
                   ungroup() %>% 
                   filter(day >= '2022-02-01') %>% 
                   select(-c(Refinery, latitude, longitude, day, H2S_daily_max, H2S_daily_avg)) %>%
                   mutate(MinDist = 1/(MinDist^2)), 
                   remove_selected_columns = TRUE)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 10)